In [1]:
%matplotlib inline
import numpy as np
from scipy.io import arff
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import patsy
import statsmodels.api as sm

from sklearn import tree, linear_model, metrics, dummy, naive_bayes, neighbors

from IPython.display import Image
import pydotplus

import nltk
import gensim
import wordcloud

import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.gensim

In [2]:
sns.set_context("paper")
sns.set_style("ticks")

def get_confusion_matrix(clf, X, y, verbose=True, classes=None):
    y_pred = clf.predict(X)
    cm = metrics.confusion_matrix(y_true=y, y_pred=y_pred)
    clf_report = metrics.classification_report(y, y_pred)
    if classes is None:
        classes = clf.classes_
    df_cm = pd.DataFrame(cm, columns=classes, index=classes)
    if verbose:
        print clf_report
        print df_cm
    return clf_report, df_cm

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [6]:
len(twenty_train.data), len(twenty_test.data)


Out[6]:
(2257, 1502)

In [11]:
twenty_train.target_names


Out[11]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [13]:
print(twenty_train.data[0])


From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.


In [18]:
"\n".join(twenty_train.data[0].splitlines()[6:-5])


Out[18]:
u'Does anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.'

In [20]:
twenty_train.target_names


Out[20]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [21]:
twenty_train.target[0]


Out[21]:
1

In [22]:
twenty_train.target_names


Out[22]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [23]:
classification_categories = ["soc.religion.christian", "sci.med"]

In [24]:
classes = np.array(twenty_train.target_names)

In [25]:
classes


Out[25]:
array(['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian'], 
      dtype='|S22')

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

Show how to get help on functions


In [27]:
CountVectorizer?

In [28]:
count_vec = CountVectorizer(stop_words="english", token_pattern="[a-z]+")

In [29]:
X_train_counts = count_vec.fit_transform(twenty_train.data)
X_test_counts = count_vec.transform(twenty_test.data)

In [30]:
print "X_train_counts.shape =", X_train_counts.shape
print "X_test_counts.shape =", X_test_counts.shape


X_train_counts.shape = (2257, 30877)
X_test_counts.shape = (1502, 30877)

In [31]:
count_vec.vocabulary_.items()[:10]


Out[31]:
[(u'raining', 22239),
 (u'uhura', 28449),
 (u'schlegel', 24205),
 (u'hordes', 12488),
 (u'moskowitz', 17773),
 (u'foul', 10438),
 (u'tilton', 27646),
 (u'compatable', 5071),
 (u'circuitry', 4515),
 (u'pantheistic', 19750)]

In [32]:
count_vec.get_feature_names()[100:110]


Out[32]:
[u'absent',
 u'absitinence',
 u'absol',
 u'absolute',
 u'absolutely',
 u'absolutes',
 u'absolutism',
 u'absolutist',
 u'absolve',
 u'absorb']

In [33]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train_counts, twenty_train.target)
report, df_cm = get_confusion_matrix(clf, X_train_counts, twenty_train.target, classes=twenty_train.target_names)
df_cm


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       480
          1       0.99      1.00      1.00       584
          2       1.00      1.00      1.00       594
          3       1.00      0.99      1.00       599

avg / total       1.00      1.00      1.00      2257

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     479              0        0   
comp.graphics                     0            584        0   
sci.med                           0              2      592   
soc.religion.christian            0              3        0   

                        soc.religion.christian  
alt.atheism                                  1  
comp.graphics                                0  
sci.med                                      0  
soc.religion.christian                     596  
Out[33]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 479 0 0 1
comp.graphics 0 584 0 0
sci.med 0 2 592 0
soc.religion.christian 0 3 0 596

In [34]:
report, df_cm = get_confusion_matrix(clf, X_test_counts, twenty_test.target, classes=twenty_train.target_names)
df_cm


             precision    recall  f1-score   support

          0       0.93      0.91      0.92       319
          1       0.95      0.97      0.96       389
          2       0.96      0.93      0.94       396
          3       0.94      0.96      0.95       398

avg / total       0.94      0.94      0.94      1502

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     289              4        6   
comp.graphics                     6            376        6   
sci.med                          12             12      369   
soc.religion.christian            5              4        5   

                        soc.religion.christian  
alt.atheism                                 20  
comp.graphics                                1  
sci.med                                      3  
soc.religion.christian                     384  
Out[34]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 289 4 6 20
comp.graphics 6 376 6 1
sci.med 12 12 369 3
soc.religion.christian 5 4 5 384

Using pipelines


In [35]:
from sklearn.pipeline import Pipeline

In [36]:
clf = Pipeline([
        ("vect", CountVectorizer(stop_words="english", token_pattern="[a-z]+")),
        ("nb_clf", naive_bayes.MultinomialNB())
    ])

In [37]:
X = twenty_train.data
y = twenty_train.target
classes = twenty_train.target_names
clf.fit(X, y)
report, df_cm = get_confusion_matrix(clf, X, y, classes=classes)
df_cm


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       480
          1       0.99      1.00      1.00       584
          2       1.00      1.00      1.00       594
          3       1.00      0.99      1.00       599

avg / total       1.00      1.00      1.00      2257

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     479              0        0   
comp.graphics                     0            584        0   
sci.med                           0              2      592   
soc.religion.christian            0              3        0   

                        soc.religion.christian  
alt.atheism                                  1  
comp.graphics                                0  
sci.med                                      0  
soc.religion.christian                     596  
Out[37]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 479 0 0 1
comp.graphics 0 584 0 0
sci.med 0 2 592 0
soc.religion.christian 0 3 0 596

In [38]:
clf.classes_


Out[38]:
array([0, 1, 2, 3])

In [39]:
clf.predict(twenty_test.data[:10])


Out[39]:
array([2, 2, 2, 0, 3, 0, 1, 3, 2, 2])

In [40]:
report, df_cm = get_confusion_matrix(clf, twenty_test.data, twenty_test.target, classes=classes)
df_cm


             precision    recall  f1-score   support

          0       0.93      0.91      0.92       319
          1       0.95      0.97      0.96       389
          2       0.96      0.93      0.94       396
          3       0.94      0.96      0.95       398

avg / total       0.94      0.94      0.94      1502

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     289              4        6   
comp.graphics                     6            376        6   
sci.med                          12             12      369   
soc.religion.christian            5              4        5   

                        soc.religion.christian  
alt.atheism                                 20  
comp.graphics                                1  
sci.med                                      3  
soc.religion.christian                     384  
Out[40]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 289 4 6 20
comp.graphics 6 376 6 1
sci.med 12 12 369 3
soc.religion.christian 5 4 5 384

Using logistic regression


In [41]:
clf = Pipeline([
        ("vect", CountVectorizer(stop_words="english", token_pattern="[a-z]+")),
        ("nb_clf", linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs"))
    ])

In [42]:
X = twenty_train.data
y = twenty_train.target
classes = twenty_train.target_names
clf.fit(X, y)
report, df_cm = get_confusion_matrix(clf, X, y, classes=classes)
df_cm


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       480
          1       1.00      1.00      1.00       584
          2       1.00      1.00      1.00       594
          3       1.00      1.00      1.00       599

avg / total       1.00      1.00      1.00      2257

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     480              0        0   
comp.graphics                     0            584        0   
sci.med                           0              0      594   
soc.religion.christian            0              0        0   

                        soc.religion.christian  
alt.atheism                                  0  
comp.graphics                                0  
sci.med                                      0  
soc.religion.christian                     599  
Out[42]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 480 0 0 0
comp.graphics 0 584 0 0
sci.med 0 0 594 0
soc.religion.christian 0 0 0 599

In [43]:
report, df_cm = get_confusion_matrix(clf, twenty_test.data, twenty_test.target, classes=classes)
df_cm


             precision    recall  f1-score   support

          0       0.93      0.79      0.86       319
          1       0.88      0.95      0.91       389
          2       0.93      0.86      0.90       396
          3       0.87      0.97      0.92       398

avg / total       0.90      0.90      0.90      1502

                        alt.atheism  comp.graphics  sci.med  \
alt.atheism                     253             12       12   
comp.graphics                     3            368       13   
sci.med                          11             33      342   
soc.religion.christian            5              6        1   

                        soc.religion.christian  
alt.atheism                                 42  
comp.graphics                                5  
sci.med                                     10  
soc.religion.christian                     386  
Out[43]:
alt.atheism comp.graphics sci.med soc.religion.christian
alt.atheism 253 12 12 42
comp.graphics 3 368 13 5
sci.med 11 33 342 10
soc.religion.christian 5 6 1 386

Text exploration

Word clouds


In [44]:
text = " ".join(twenty_train.data)

In [45]:
wc = wordcloud.WordCloud(max_font_size=40, relative_scaling=.5).generate(text)

In [46]:
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()



In [47]:
def get_words_of_class(data, labels, c=0, ax=None):
    if ax is None:
        fig, ax = plt.subplots()
    labels = np.array(labels)
    idx = np.where(labels==c)[0]
    text = " ".join(data[i] for i in idx)
    wc = wordcloud.WordCloud(max_font_size=40, relative_scaling=.5).generate(text)
    ax.imshow(wc)
    ax.axis("off")
    return ax

In [48]:
fig, ax = plt.subplots(2,2, figsize=(10,10))
ax = ax.flatten()
labels = twenty_train.target
data = twenty_train.data
classes = twenty_train.target_names
for i, axi in enumerate(ax):
    get_words_of_class(data, labels, c=i, ax=axi)
    axi.set_title(classes[i])
fig.tight_layout()


Topic Modelling


In [49]:
from nltk.corpus import movie_reviews

In [50]:
movie_reviews.categories()


Out[50]:
[u'neg', u'pos']

In [51]:
movie_reviews.words()


Out[51]:
[u'plot', u':', u'two', u'teen', u'couples', u'go', ...]

In [52]:
sents = movie_reviews.sents()

In [53]:
len(sents)


Out[53]:
71532

In [55]:
sents[0]


Out[55]:
[u'plot',
 u':',
 u'two',
 u'teen',
 u'couples',
 u'go',
 u'to',
 u'a',
 u'church',
 u'party',
 u',',
 u'drink',
 u'and',
 u'then',
 u'drive',
 u'.']

In [57]:
movie_reviews.categories()[0]


Out[57]:
u'neg'

In [58]:
for i, s in enumerate(sents[:10]):
    print "S[%s]:\t%s" % (i, " ".join(s))


S[0]:	plot : two teen couples go to a church party , drink and then drive .
S[1]:	they get into an accident .
S[2]:	one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares .
S[3]:	what ' s the deal ?
S[4]:	watch the movie and " sorta " find out .
S[5]:	.
S[6]:	.
S[7]:	critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package .
S[8]:	which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly .
S[9]:	they seem to have taken this pretty neat concept , but executed it terribly .

In [61]:
bigrams = gensim.models.Phrases(sents[:1000])

In [62]:
bigrams.vocab.items()[0:10]


Out[62]:
[('unimaginative', 2),
 ('and_most', 1),
 ('"_shrewd', 1),
 ('automobile_if', 1),
 ('i_haven', 2),
 ('yellow', 2),
 ('disturbed_parental', 1),
 ('too_ditzy', 1),
 ('hanging', 2),
 ('be_one', 1)]

In [63]:
sorted(bigrams.vocab.iteritems(), key=lambda x: x[1], reverse=True)[:10]


Out[63]:
[(',', 1018),
 ('the', 1006),
 ('.', 886),
 ('a', 516),
 ("'", 510),
 ('and', 455),
 ('of', 443),
 ('to', 428),
 ('"', 326),
 ('s', 315)]

In [64]:
word_frequencies = map(lambda x: x[1], bigrams.vocab.iteritems())

In [65]:
plt.hist(word_frequencies, bins=range(0,100), log=True)
plt.xscale("symlog")



In [66]:
sorted(filter(lambda x: isinstance(x[0], str) and "_" in x[0], 
        bigrams.vocab.iteritems()), key=lambda x: x[1], reverse=True)[:30]


Out[66]:
[("'_s", 314),
 (',_and', 108),
 ("'_t", 106),
 ("it_'", 77),
 ('in_the', 76),
 ('of_the', 73),
 (',_but', 69),
 (',_the', 58),
 ('the_film', 53),
 (')_,', 46),
 (',_"', 43),
 ('the_movie', 41),
 ('to_be', 37),
 (')_.', 34),
 (',_it', 32),
 ('to_the', 31),
 ('on_the', 31),
 ('from_the', 30),
 ('is_a', 29),
 ('and_the', 28),
 ('in_a', 28),
 ('with_the', 27),
 ('._"', 27),
 ("he_'", 26),
 ('with_a', 25),
 ('as_a', 25),
 (',_a', 24),
 ('for_the', 24),
 ('of_a', 23),
 ('this_film', 23)]

In [68]:
corpus = bigrams[sents[:1000]]
id2word = gensim.corpora.Dictionary(corpus)

In [69]:
len(id2word.keys())


Out[69]:
4255

In [70]:
corpus_processed = [id2word.doc2bow(k) for k in corpus]
print len(corpus_processed)


1000

In [71]:
corpus_processed[0]


Out[71]:
[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1)]

In [72]:
corpus[0]


Out[72]:
[u'plot',
 u':',
 u'two',
 u'teen',
 u'couples',
 u'go',
 u'to',
 u'a',
 u'church',
 u'party',
 u',',
 u'drink',
 u'and',
 u'then',
 u'drive',
 u'.']

In [73]:
LDA_model = gensim.models.ldamodel.LdaModel(corpus_processed, num_topics=10, id2word=id2word)

In [76]:
LDA_model.print_topics(num_words=20)


Out[76]:
[(0,
  u"0.038*, + 0.032*the + 0.025*. + 0.025*and + 0.022*) + 0.022*- + 0.022*( + 0.017*a + 0.010*is + 0.010*of + 0.009*to + 0.009*'_s + 0.008*it + 0.007*with + 0.007*who + 0.007*that + 0.007*/_10 + 0.007*in + 0.006*on + 0.005*his"),
 (1,
  u'0.055*the + 0.042*, + 0.034*. + 0.031*" + 0.027*a + 0.019*of + 0.016*and + 0.016*to + 0.013*is + 0.012*that + 0.010*in + 0.009*) + 0.008*( + 0.008*\'_s + 0.008*for + 0.008*movie + 0.008*- + 0.006*as + 0.005*this + 0.005*by'),
 (2,
  u"0.058*. + 0.033*the + 0.030*, + 0.021*a + 0.016*in + 0.015*is + 0.013*and + 0.009*that + 0.009*for + 0.008*'_s + 0.008*of + 0.008*to + 0.007*,_but + 0.007*- + 0.005*i + 0.005*it + 0.005*it_' + 0.005*s + 0.005*not + 0.005*on"),
 (3,
  u'0.048*, + 0.044*the + 0.033*. + 0.018*a + 0.017*to + 0.016*in + 0.015*and + 0.015*of + 0.012*( + 0.012*is + 0.010*) + 0.010*" + 0.010*- + 0.008*with + 0.007*that + 0.006*as + 0.006*it + 0.006*an + 0.006*on + 0.006*for'),
 (4,
  u'0.047*. + 0.033*the + 0.023*to + 0.018*and + 0.017*of + 0.015*, + 0.014*\'_s + 0.012*" + 0.010*a + 0.010*in + 0.009*that + 0.009*is + 0.008*it + 0.008*- + 0.007*: + 0.006*) + 0.006*his + 0.005*at + 0.005*i + 0.004*,_but'),
 (5,
  u'0.045*the + 0.031*. + 0.024*, + 0.017*of + 0.015*that + 0.015*and + 0.012*to + 0.010*\'_s + 0.009*it + 0.008*a + 0.008*? + 0.007*on + 0.007*this + 0.007*in + 0.006*with + 0.006*s + 0.006*movie + 0.006*t + 0.006*- + 0.005*"'),
 (6,
  u'0.044*, + 0.025*. + 0.022*and + 0.018*is + 0.016*the + 0.016*a + 0.012*of + 0.009*in + 0.009*movie + 0.009*- + 0.008*it + 0.007*to + 0.007*that + 0.007*this + 0.006*all + 0.005*who + 0.005*" + 0.005*i + 0.005*just + 0.005*one'),
 (7,
  u"0.043*. + 0.041*the + 0.024*, + 0.020*a + 0.019*in + 0.018*to + 0.016*of + 0.012*i + 0.009*and + 0.009*is + 0.008*that + 0.008*with + 0.008*for + 0.007*- + 0.007*are + 0.007*he + 0.006*'_s + 0.005*this + 0.005*movie + 0.005*her"),
 (8,
  u'0.053*, + 0.031*the + 0.030*. + 0.025*a + 0.022*and + 0.019*\'_s + 0.018*of + 0.014*to + 0.014*in + 0.013*- + 0.013*as + 0.009*it + 0.009*" + 0.008*on + 0.008*is + 0.007*he + 0.006*that + 0.006*) + 0.006*( + 0.006*who'),
 (9,
  u'0.032*. + 0.026*the + 0.026*and + 0.022*of + 0.022*" + 0.021*a + 0.020*, + 0.015*that + 0.013*to + 0.009*in + 0.009*it + 0.009*i + 0.009*is + 0.008*this + 0.008*movie + 0.007*for + 0.007*his + 0.006*\'_s + 0.006*) + 0.006*film')]

In [75]:
LDA_model.get_document_topics(corpus_processed[0])


Out[75]:
[(4, 0.94704933837128213)]

In [77]:
doc_topics = LDA_model[corpus_processed]

In [78]:
doc_topics[1]


Out[78]:
[(0, 0.01428764777370942),
 (1, 0.014288378183196067),
 (2, 0.014288441454276673),
 (3, 0.87140621304095234),
 (4, 0.014288298442854355),
 (5, 0.014287667479202367),
 (6, 0.014288274275986387),
 (7, 0.014287818823966531),
 (8, 0.014289939602374114),
 (9, 0.014287320923481802)]

In [79]:
pyLDAvis.gensim.prepare(LDA_model, corpus_processed,
                        id2word)


Out[79]:

POS tagging


In [81]:
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)


Out[81]:
[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

NER


In [82]:
text = nltk.word_tokenize("US president Barack Obama signed a new treaty with the Indian prime minister Narendra Modi, in New Delhi.")
pos_tags = nltk.pos_tag(text)
print pos_tags


[('US', 'NNP'), ('president', 'NN'), ('Barack', 'NNP'), ('Obama', 'NNP'), ('signed', 'VBD'), ('a', 'DT'), ('new', 'JJ'), ('treaty', 'NN'), ('with', 'IN'), ('the', 'DT'), ('Indian', 'JJ'), ('prime', 'JJ'), ('minister', 'NN'), ('Narendra', 'NNP'), ('Modi', 'NNP'), (',', ','), ('in', 'IN'), ('New', 'NNP'), ('Delhi', 'NNP'), ('.', '.')]

In [84]:
try:
    chunk_tags = nltk.ne_chunk(pos_tags, binary=False)
except:
    print "Done"
print chunk_tags


(S
  (GPE US/NNP)
  president/NN
  (PERSON Barack/NNP Obama/NNP)
  signed/VBD
  a/DT
  new/JJ
  treaty/NN
  with/IN
  the/DT
  (GPE Indian/JJ)
  prime/JJ
  minister/NN
  (PERSON Narendra/NNP Modi/NNP)
  ,/,
  in/IN
  (GPE New/NNP Delhi/NNP)
  ./.)

In [85]:
from nltk.corpus import wordnet as wn

In [88]:
wn.synsets('dog.n.01')


Out[88]:
[]

In [ ]: